# download_usd_llt.py
# USD LLT Journal (Journal on Language and Language Teaching) Downloader
# Automates downloading PDFs from Sanata Dharma LLT journal
# - Combines metadata and TOC scraping
# - Skips Book Reviews, Full Issue, Editorials
# - Handles numeric and current issue URLs
# - Resolves missing metadata and duplicate galley links

import os
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

HEADERS = {"User-Agent": "Mozilla/5.0"}

def sanitize(text):
    r"""Return *text* made safe for use as a file or folder name.

    Runs of whitespace (including the newlines and tabs that can occur
    inside scraped HTML text) are collapsed to a single space, then control
    characters and the characters ``\ / * ? : " < > |`` are removed, and
    leading/trailing whitespace is stripped.
    """
    # Collapse whitespace *before* deleting characters so that ordinary
    # single-line titles produce exactly the same name as before, while a
    # line break inside a title becomes a plain space instead of an
    # invalid filename character.
    collapsed = re.sub(r"\s+", " ", text)
    return re.sub(r'[\\/*?:"<>|\x00-\x1f\x7f]', "", collapsed).strip()

issue_url = input("Enter LLT issue URL: ").strip()

# Fetch the issue's table-of-contents page. The timeout keeps the script
# from hanging forever on an unresponsive server, and a failed request or an
# HTTP error status stops the run instead of silently scraping an error page.
try:
    resp = requests.get(issue_url, headers=HEADERS, allow_redirects=True, timeout=30)
    resp.raise_for_status()
except requests.RequestException as e:
    raise SystemExit(f"[ERROR] Could not fetch issue page: {e}") from e
soup = BeautifulSoup(resp.text, "html.parser")

# Site root (scheme + host) used to turn links found in the page into
# absolute URLs. It is taken from the final URL after redirects, so the
# links built later point at the host that actually served the page.
parsed = urlparse(resp.url)
base_url = f"{parsed.scheme}://{parsed.netloc}"

# Folder naming: build "LLT_Vol<volume>_Issue<number>" from the page <title>,
# e.g. a title containing "Vol 27, No 1" gives "LLT_Vol27_Issue1".
# When the title is missing or carries no volume/number, both parts fall
# back to "Unknown" (the earlier placeholders produced the doubled name
# "LLT_VolVol_IssueIssue").
vol, iss = "Unknown", "Unknown"
title_tag = soup.find("title")
if title_tag:
    title_text = title_tag.get_text(strip=True)
    # Accept "Vol"/"Vol." and "No"/"No." in any letter case; the word
    # boundaries stop "no" from matching inside a longer word.
    match = re.search(
        r"\bVol\.?\s*(\d+).*\bNo\.?\s*(\d+)", title_text, re.IGNORECASE
    )
    if match:
        vol, iss = match.groups()

folder = sanitize(f"LLT_Vol{vol}_Issue{iss}")
os.makedirs(folder, exist_ok=True)

# Collect all article entries with galley links.
# Each entry is a tuple:
#   (sanitized title, article view URL, PDF download URL, is_book_review)
article_entries = []
seen_pdf_urls = set()  # download URLs already collected, to drop duplicates
for row in soup.find_all("tr"):
    toc_title = row.find("div", class_="tocTitle")
    toc_galley = row.find("div", class_="tocGalleys")
    if not (toc_title and toc_galley):
        continue

    a_title = toc_title.find("a", href=True)
    galley_links = toc_galley.find_all("a", href=True)
    if not (a_title and galley_links):
        continue

    # Prefer a galley whose link text mentions "PDF" so that an HTML galley
    # listed first does not shadow the PDF one; otherwise keep the first
    # link. (Assumes PDF galleys are labelled with text containing "PDF".)
    a_pdf = next(
        (a for a in galley_links if "pdf" in a.get_text(strip=True).lower()),
        galley_links[0],
    )

    title = sanitize(a_title.get_text(strip=True))
    view_url = urljoin(base_url, a_title["href"])
    galley_url = urljoin(base_url, a_pdf["href"])
    # The galley "view" page wraps the file; the "download" route serves it.
    pdf_url = galley_url.replace("/view/", "/download/")

    # The same galley can be reached from more than one <tr> (for example
    # an outer layout row that contains the article's row); keep the first.
    if pdf_url in seen_pdf_urls:
        continue
    seen_pdf_urls.add(pdf_url)

    # Check if this row is inside a Book Review section: look at the
    # nearest section heading that precedes the row's table.
    is_book_review = False
    parent_table = row.find_parent("table")
    if parent_table:
        prev_h4 = parent_table.find_previous("h4", class_="tocSectionTitle")
        if prev_h4 and "book review" in prev_h4.get_text(strip=True).lower():
            is_book_review = True

    article_entries.append((title, view_url, pdf_url, is_book_review))

print(f"Found {len(article_entries)} article entries")

# Download every collected article that is not a book review, a full-issue
# file or an editorial, saving it as "<title>.pdf" inside the issue folder.
count = 0  # number of PDFs saved during this run

for title, view_url, pdf_url, is_book_review in article_entries:
    lower_title = title.lower()
    if is_book_review or "book review" in lower_title:
        print(f"[SKIP] Book Review: {title}")
        continue
    if "full issue" in lower_title:
        print(f"[SKIP] Full Issue: {title}")
        continue
    if lower_title.startswith("editorial"):
        print(f"[SKIP] Editorial: {title}")
        continue

    filename = f"{title}.pdf"
    path = os.path.join(folder, filename)

    if os.path.exists(path):
        print(f"[SKIP] Already downloaded: {filename}")
        continue

    print(f"[{count+1}] Downloading: {filename}")
    try:
        # The timeout prevents one stalled download from blocking the rest.
        pdf = requests.get(pdf_url, headers=HEADERS, timeout=60)
        pdf.raise_for_status()
    except requests.RequestException as e:
        print(f"[ERROR] Downloading {title}: {e}")
        continue

    # Accept the response when the server labels it as a PDF or when the
    # body starts with the PDF signature (covers servers that send a generic
    # type such as application/octet-stream).
    content_type = pdf.headers.get("Content-Type", "").lower()
    if "application/pdf" not in content_type and not pdf.content.startswith(b"%PDF"):
        print(f"[SKIP] Not a PDF: {title}")
        continue

    # Write to a temporary name and rename on success, so an interrupted or
    # failed write never leaves a truncated file that a later run would
    # mistake for "already downloaded".
    tmp_path = path + ".part"
    try:
        with open(tmp_path, "wb") as f:
            f.write(pdf.content)
        os.replace(tmp_path, path)
    except OSError as e:
        print(f"[ERROR] Downloading {title}: {e}")
        try:
            os.remove(tmp_path)
        except OSError:
            pass  # nothing was written, or the partial file cannot be removed
        continue

    count += 1
    print(f"[OK] Saved: {filename}")

print(f"\nDone! {count} PDFs saved in {folder}")
